title: "Statistics Assignment"
author: "Himal Baskota"
date: "2025-05-11"
# --- Environment setup -------------------------------------------------------
# Use the PNG device for non-interactive plotting and pin a CRAN mirror so
# install.packages() never prompts for one.
Sys.setenv("R_INTERACTIVE_DEVICE" = "png")
options(repos = c(CRAN = "https://cran.r-project.org"))

# Install any missing dependency, then load it explicitly with library().
# FIX: require() should not be used for loading (it returns FALSE instead of
# erroring); requireNamespace() is the idiomatic availability check.
if (!requireNamespace("rsample", quietly = TRUE)) install.packages("rsample")
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")

library(rsample)   # initial_split()/training()/testing() for data splitting
library(ggplot2)   # for creating plots

# --- Data import -------------------------------------------------------------

# Base directory of the CSV inputs.
# FIX: the original string literals contained "\ ", which is NOT a valid
# escape sequence in R strings and fails to parse. The backslashes are
# removed; the literal path (including the space before "/Himal") is
# unchanged.
data_dir <- "/Volumes/Mac's Drive /Himal/Softwarica/stat/r-solution"

# Input data: four signals per row, no header row in the CSV.
X <- as.matrix(read.csv(file = file.path(data_dir, "x.csv"), header = FALSE))
colnames(X) <- c("x1", "x3", "x4", "x5")

# Display the first rows of the imported input data.
head(X)
##         x1    x3      x4    x5
## [1,]  8.34 40.77 1010.84 90.01
## [2,] 23.64 58.49 1011.40 74.20
## [3,] 29.74 56.90 1007.15 41.91
## [4,] 19.07 49.69 1007.22 76.79
## [5,] 11.80 40.66 1017.13 97.20
## [6,] 13.97 39.16 1016.05 84.60

# Output (target) data from y.csv.
Y <- as.matrix(read.csv(file = file.path(data_dir, "y.csv"), header = FALSE))
colnames(Y) <- c("y")

# Display the first rows of the output data.
head(Y)
##           y
## [1,] 480.48
## [2,] 445.75
## [3,] 438.76
## [4,] 453.09
## [5,] 464.43
## [6,] 470.96

# Time index data from t.csv.
time <- as.matrix(read.csv(file.path(data_dir, "t.csv"), header = FALSE))

# Display the first rows of the time index.
head(time)
##      V1
## [1,]  1
## [2,]  2
## [3,]  3
## [4,]  4
## [5,]  5
## [6,]  6

# --- Task 1.1: time-series plots of the input and output signals -------------

# Build time-series objects.
# FIX: `start` must be the time of the first observation. The original passed
# c(min(time), max(time)), which misuses the second element (the within-cycle
# offset) and shifted the series start far into the future.
X.ts <- ts(X, start = min(time), frequency = 1)
Y.ts <- ts(Y, start = min(time), frequency = 1)

# Plot the first 50 samples of each input signal.
# FIX: R is 1-indexed; in 0:50 the 0 was silently dropped, so use 1:50.
plot(X.ts[1:50, 1], main = "Time series plot of Ambient Temperature",
     xlab = "Time", ylab = "°C", col = "#1130ee", type = "l")

plot(X.ts[1:50, 2], main = "Time series plot of Atmospheric Pressure",
     xlab = "Time", ylab = "millibar", col = "#1130ee", type = "l")

plot(X.ts[1:50, 3], main = "Time series plot of Humidity Level",
     xlab = "Time", ylab = "%", col = "#1130ee", type = "l")

plot(X.ts[1:50, 4], main = "Time series plot of Exhaust Vaccum",
     xlab = "Time", ylab = "cm Hg", col = "#1130ee", type = "l")

# A later window of the output signal (subsetting drops the ts attributes,
# so the x-axis shows the position within the window, as in the original).
plot(Y.ts[400:450], main = "Time series plot of Energy Output",
     xlab = "Time", ylab = "MegaWatt", col = "#1130ee", type = "l")

# --- Task 1.2: distribution of each input signal -----------------------------
# (FIX: this heading was a bare prose line, which is a syntax error in R.)

# Kernel density estimate of every input column.
density_x1 <- density(X[, "x1"])
density_x3 <- density(X[, "x3"])
density_x4 <- density(X[, "x4"])
density_x5 <- density(X[, "x5"])

# Common y-axis limit: the largest density value across all four signals.
max_y <- max(
  max(density_x1$y),
  max(density_x3$y),
  max(density_x4$y),
  max(density_x5$y)
)

# Widen the margins so the legend has room.
par(mar = c(5, 10, 4, 8))  # c(bottom, left, top, right)

plot(density_x1,
     main = "Density Plots of Input Signals",
     xlab = "Values",
     ylab = "Density",
     col = "#1130ee",   # blue: temperature
     ylim = c(0, max_y),
     xlim = c(0, 1200),
     lwd = 2)

lines(density_x3, col = "#47b86d", lwd = 2)  # green: pressure
lines(density_x4, col = "#996668", lwd = 2)  # brown: humidity
lines(density_x5, col = "#cedd22", lwd = 2)  # yellow: vacuum

par(xpd = TRUE)  # allow the legend to be drawn outside the plot region
# FIX: the humidity legend swatch now matches the plotted line colour
# (#996668); the original legend used #c33c63, which is never plotted.
legend("topright",
       legend = c("Temperature (x1)", "Pressure (x3)", "Humidity (x4)", "Vacuum (x5)"),
       col = c("#1130ee", "#47b86d", "#996668", "#cedd22"),
       lwd = 2)

par(xpd = FALSE)  # restore clipping to the plot region

# Histogram of all input values pooled together, on the density scale.
hist(X, freq = FALSE, main = "Density plot of Input Signals",
     xlab = "Distribution", col = "#47b86d")

# Same histogram with the pooled density curve and a jittered data rug.
density_of_X <- density(X)
hist(X, freq = FALSE,
     main = "Density plot with Density line plot of Input signals",
     xlab = "Distribution", col = "#47b86d")
lines(density_of_X, lwd = 2, col = "#1130ee")
rug(jitter(X))

# --- Histogram + density plot of each input signal and the output ------------

# Helper: histogram on the density scale, overlaid with a kernel-density
# curve and a jittered rug of the individual data points.
plot_hist_density <- function(values, title, x_label) {
  hist(values, freq = FALSE, main = title, xlab = x_label, col = "#47b86d")
  lines(density(values), lwd = 2, col = "#1130ee")
  rug(jitter(values))   # show the raw observations along the x-axis
  invisible(NULL)
}

# Input signal x1: ambient temperature.
plot_hist_density(X[, "x1"], "Histogram and density plot of Temperature", "°C")

# Input signal x3: atmospheric pressure.
plot_hist_density(X[, "x3"], "Histogram and density plot of Pressure", "millibar")

# Input signal x4: humidity.
plot_hist_density(X[, "x4"], "Histogram and density plot of Humidity", "%")

# Input signal x5: exhaust vacuum.
plot_hist_density(X[, "x5"], "Histogram and density plot of Vacuum", "cm Hg")

# Output signal y: energy output.
plot_hist_density(Y[, "y"], "Histogram and density plot of Energy Output", "MegaWatt")

# --- Task 1.3: scatter plots to identify correlation -------------------------
# (FIX: the two heading lines were bare prose, which is a syntax error in R.)

# Arrange the plots in a 2x2 grid on a single screen.
par(mfrow = c(2, 2))

# Each input signal against the output signal Y.
# FIX: "betweeen" -> "between" in the titles; the comment for the last plot
# said X4 but it plots x5.
plot(X[, "x1"], Y, main = "Correlation between X1 and Y signal",
     xlab = "X1 signal", ylab = "Output signal y", col = "#47b86d")

plot(X[, "x3"], Y, main = "Correlation between X3 and Y signal",
     xlab = "X3 signal", ylab = "Output signal y", col = "#47b86d")

plot(X[, "x4"], Y, main = "Correlation between X4 and Y signal",
     xlab = "X4 signal", ylab = "Output signal y", col = "#47b86d")

plot(X[, "x5"], Y, main = "Correlation between X5 and Y signal",
     xlab = "X5 signal", ylab = "Output signal y", col = "#47b86d")

# Q-Q plots to check each input signal for normality.
qqnorm(X[, "x1"], col = "#47b86d", main = "QQ plot of Temperature")
qqline(X[, "x1"], col = "#1130ee")

qqnorm(X[, "x3"], col = "#47b86d", main = "QQ plot of Pressure")
qqline(X[, "x3"], col = "#1130ee")

qqnorm(X[, "x4"], col = "#47b86d", main = "QQ plot of Humidity")
qqline(X[, "x4"], col = "#1130ee")

qqnorm(X[, "x5"], col = "#47b86d", main = "QQ plot of Vacuum")
qqline(X[, "x5"], col = "#1130ee")

# --- Task 2 ------------------------------------------------------------------
# (FIX: the heading lines were bare prose, which is a syntax error in R.)

# Column of ones: the intercept (bias) term bound into every design matrix.
# FIX: use nrow(X) directly; length(X)/4 only worked because X happens to
# have exactly four columns.
ones <- matrix(1, nrow(X), 1)
head(ones)
##      [,1]
## [1,]    1
## [2,]    1
## [3,]    1
## [4,]    1
## [5,]    1
## [6,]    1

# --- Task 2.1: least-squares estimation (thetahat) of each candidate model ---
# (FIX: heading lines were bare prose; the five models shared near-identical
# scale-and-solve code, now factored into two helpers.)

# Helper: z-score the raw predictor columns and prepend the intercept column
# of ones. Scaling the predictors (but not the intercept) keeps the normal
# equations well conditioned without changing which model is fitted.
build_design <- function(raw_predictors) {
  cbind(ones, scale(raw_predictors))
}

# Helper: ordinary least squares, thetahat = (X'X)^{-1} X' y.
estimate_theta <- function(design, y) {
  solve(t(design) %*% design) %*% t(design) %*% y
}

# Model 1: y = theta_bias + theta1*x4 + theta2*x3^2
X_model1 <- build_design(cbind(X[, "x4"], X[, "x3"]^2))
head(X_model1)
##      [,1]       [,2]        [,3]
## [1,]    1 -0.4073356 -1.02213385
## [2,]    1 -0.3130402  0.21910945
Model1_thetahat <- estimate_theta(X_model1, Y)
Model1_thetahat
##               y
## [1,] 454.365009
## [2,]   3.348278
## [3,] -13.211452

# Model 2: y = theta_bias + theta1*x4 + theta2*x3^2 + theta3*x5
X_model2 <- build_design(cbind(X[, "x4"], X[, "x3"]^2, X[, "x5"]))
head(X_model2)
Model2_thetahat <- estimate_theta(X_model2, Y)
Model2_thetahat
##               y
## [1,] 454.365009
## [2,]   3.432657
## [3,] -12.406622
## [4,]   2.517326

# Model 3: y = theta_bias + theta1*x3 + theta2*x4 + theta3*x5^3
X_model3 <- build_design(cbind(X[, "x3"], X[, "x4"], X[, "x5"]^3))
head(X_model3)
Model3_thetahat <- estimate_theta(X_model3, Y)
Model3_thetahat
##               y
## [1,] 454.365009
## [2,] -12.722943
## [3,]   3.402683
## [4,]   2.315840

# Model 4: y = theta_bias + theta1*x4 + theta2*x3^2 + theta3*x5^3
X_model4 <- build_design(cbind(X[, "x4"], X[, "x3"]^2, X[, "x5"]^3))
head(X_model4)
Model4_thetahat <- estimate_theta(X_model4, Y)
Model4_thetahat
##               y
## [1,] 454.365009
## [2,]   3.487220
## [3,] -12.402038
## [4,]   2.487015

# Model 5: y = theta_bias + theta1*x4 + theta2*x1^2 + theta3*x3^2
X_model5 <- build_design(cbind(X[, "x4"], X[, "x1"]^2, X[, "x3"]^2))
head(X_model5)
Model5_thetahat <- estimate_theta(X_model5, Y)
Model5_thetahat
##               y
## [1,] 454.365009
## [2,]   1.349107
## [3,] -10.605331
## [4,]  -5.173124

# Printing the estimated parameters (thetahat) of each model, both as the
# fitted column vector and transposed as a row for easy reading.
# (FIX: the heading was a bare prose line, which is a syntax error in R.)

# Model 1
Model1_thetahat
##               y
## [1,] 454.365009
## [2,]   3.348278
## [3,] -13.211452
t(Model1_thetahat)
##      [,1]     [,2]      [,3]
## y 454.365 3.348278 -13.21145

# Model 2
Model2_thetahat
##               y
## [1,] 454.365009
## [2,]   3.432657
## [3,] -12.406622
## [4,]   2.517326
t(Model2_thetahat)
##      [,1]     [,2]      [,3]     [,4]
## y 454.365 3.432657 -12.40662 2.517326

# Model 3
Model3_thetahat
##               y
## [1,] 454.365009
## [2,] -12.722943
## [3,]   3.402683
## [4,]   2.315840
t(Model3_thetahat)
##      [,1]      [,2]     [,3]    [,4]
## y 454.365 -12.72294 3.402683 2.31584

# Model 4
Model4_thetahat
##               y
## [1,] 454.365009
## [2,]   3.487220
## [3,] -12.402038
## [4,]   2.487015
t(Model4_thetahat)
##      [,1]    [,2]      [,3]     [,4]
## y 454.365 3.48722 -12.40204 2.487015

# Model 5
Model5_thetahat
##               y
## [1,] 454.365009
## [2,]   1.349107
## [3,] -10.605331
## [4,]  -5.173124
t(Model5_thetahat)
##      [,1]     [,2]      [,3]      [,4]
## y 454.365 1.349107 -10.60533 -5.173124

# --- Task 2.2: fitted values (Y-hat) and RSS of each model -------------------
# (FIX: the heading was a bare prose line, which is a syntax error in R.)

# Residual sum of squares between observed and fitted outputs.
rss <- function(y, y_hat) sum((y - y_hat)^2)

# Model 1
Y_hat_model1 <- X_model1 %*% Model1_thetahat
head(Y_hat_model1)
##             y
## [1,] 466.5050
## [2,] 450.4221
RSS_Model_1 <- rss(Y, Y_hat_model1)
RSS_Model_1   # scalar, so the original head() wrapper was unnecessary
## [1] 657248.2

# Model 2
Y_hat_model2 <- X_model2 %*% Model2_thetahat
head(Y_hat_model2)
RSS_Model_2 <- rss(Y, Y_hat_model2)
RSS_Model_2
## [1] 602347.1

# Model 3
Y_hat_model3 <- X_model3 %*% Model3_thetahat
head(Y_hat_model3)
RSS_Model_3 <- rss(Y, Y_hat_model3)
RSS_Model_3
## [1] 547491.6

# Model 4
Y_hat_model4 <- X_model4 %*% Model4_thetahat
head(Y_hat_model4)
RSS_Model_4 <- rss(Y, Y_hat_model4)
RSS_Model_4
## [1] 603630.7

# Model 5
Y_hat_model5 <- X_model5 %*% Model5_thetahat
head(Y_hat_model5)
RSS_Model_5 <- rss(Y, Y_hat_model5)
RSS_Model_5
## [1] 365625

# RSS of every model side by side (smaller is better).
dfRSS <- data.frame(model1 = RSS_Model_1, model2 = RSS_Model_2,
                    model3 = RSS_Model_3, model4 = RSS_Model_4,
                    model5 = RSS_Model_5)
dfRSS
##     model1   model2   model3   model4 model5
## 1 657248.2 602347.1 547491.6 603630.7 365625

# --- Task 2.3: residual variance and Gaussian log-likelihood of each model ---

N <- length(Y)   # number of observations

# Gaussian log-likelihood of the data given a model's RSS and its residual
# variance estimate (variance = RSS/(N-1), matching the original analysis).
gaussian_loglik <- function(rss_value, variance) {
  -(N / 2) * log(2 * pi) - (N / 2) * log(variance) -
    rss_value / (2 * variance)
}

# NOTE: the misspelled "likehood_" names are kept because Task 2.4 uses them.

# Model 1
Variance_model1 <- RSS_Model_1 / (N - 1)
Variance_model1
## [1] 68.69951
likehood_Model_1 <- gaussian_loglik(RSS_Model_1, Variance_model1)
likehood_Model_1
## [1] -33810.99

# Model 2
Variance_model2 <- RSS_Model_2 / (N - 1)
Variance_model2
## [1] 62.96091
likehood_Model_2 <- gaussian_loglik(RSS_Model_2, Variance_model2)
likehood_Model_2
## [1] -33393.69

# Model 3
Variance_model3 <- RSS_Model_3 / (N - 1)
Variance_model3
## [1] 57.2271
likehood_Model_3 <- gaussian_loglik(RSS_Model_3, Variance_model3)
likehood_Model_3
## [1] -32936.88

# Model 4
Variance_model4 <- RSS_Model_4 / (N - 1)
Variance_model4
## [1] 63.09509
likehood_Model_4 <- gaussian_loglik(RSS_Model_4, Variance_model4)
likehood_Model_4
## [1] -33403.88

# Model 5
Variance_model5 <- RSS_Model_5 / (N - 1)
Variance_model5
## [1] 38.21731
likehood_Model_5 <- gaussian_loglik(RSS_Model_5, Variance_model5)
likehood_Model_5
## [1] -31005.4

# Variances side by side.
dfVariance <- data.frame(model1 = Variance_model1, model2 = Variance_model2,
                         model3 = Variance_model3, model4 = Variance_model4,
                         model5 = Variance_model5)
dfVariance
##     model1   model2  model3   model4   model5
## 1 68.69951 62.96091 57.2271 63.09509 38.21731

# Log-likelihoods side by side (larger is better).
dfLikelihood <- data.frame(model1 = likehood_Model_1, model2 = likehood_Model_2,
                           model3 = likehood_Model_3, model4 = likehood_Model_4,
                           model5 = likehood_Model_5)
dfLikelihood
##      model1    model2    model3    model4   model5
## 1 -33810.99 -33393.69 -32936.88 -33403.88 -31005.4

# --- Task 2.4: AIC and BIC of each model -------------------------------------
# (FIX: the heading lines were bare prose, which is a syntax error in R.)
# AIC = 2K - 2*loglik; BIC = K*log(N) - 2*loglik, K = number of parameters
# (intercept included). Smaller is better for both.

# Model 1
K_model1 <- length(Model1_thetahat)
K_model1
## [1] 3
AIC_model1 <- 2 * K_model1 - 2 * likehood_Model_1
AIC_model1
## [1] 67627.98
BIC_model1 <- K_model1 * log(N) - 2 * likehood_Model_1
BIC_model1
## [1] 67649.48

# Model 2
K_model2 <- length(Model2_thetahat)
K_model2
## [1] 4
AIC_model2 <- 2 * K_model2 - 2 * likehood_Model_2
AIC_model2
## [1] 66795.38
BIC_model2 <- K_model2 * log(N) - 2 * likehood_Model_2
BIC_model2
## [1] 66824.05

# Model 3
K_model3 <- length(Model3_thetahat)
K_model3
## [1] 4
AIC_model3 <- 2 * K_model3 - 2 * likehood_Model_3
AIC_model3
## [1] 65881.77
BIC_model3 <- K_model3 * log(N) - 2 * likehood_Model_3
BIC_model3
## [1] 65910.43

# Model 4
K_model4 <- length(Model4_thetahat)
K_model4
## [1] 4
AIC_model4 <- 2 * K_model4 - 2 * likehood_Model_4
AIC_model4
## [1] 66815.75
BIC_model4 <- K_model4 * log(N) - 2 * likehood_Model_4
BIC_model4
## [1] 66844.42

# Model 5
K_model5 <- length(Model5_thetahat)
K_model5
## [1] 4
AIC_model5 <- 2 * K_model5 - 2 * likehood_Model_5
AIC_model5
## [1] 62018.79
BIC_model5 <- K_model5 * log(N) - 2 * likehood_Model_5
BIC_model5
## [1] 62047.46

# Parameter counts side by side.
dfK <- data.frame(model1 = K_model1, model2 = K_model2, model3 = K_model3,
                  model4 = K_model4, model5 = K_model5)
dfK
##   model1 model2 model3 model4 model5
## 1      3      4      4      4      4

# AIC values side by side.
dfAIC <- data.frame(model1 = AIC_model1, model2 = AIC_model2,
                    model3 = AIC_model3, model4 = AIC_model4,
                    model5 = AIC_model5)
dfAIC
##     model1   model2   model3   model4   model5
## 1 67627.98 66795.38 65881.77 66815.75 62018.79

# BIC values side by side.
dfBIC <- data.frame(model1 = BIC_model1, model2 = BIC_model2,
                    model3 = BIC_model3, model4 = BIC_model4,
                    model5 = BIC_model5)
dfBIC
##     model1   model2   model3   model4   model5
## 1 67649.48 66824.05 65910.43 66844.42 62047.46

# --- Task 2.5: model residuals and their normal Q-Q plots --------------------
# (FIX: the heading was a bare prose line, which is a syntax error in R.)

par(mfrow = c(1, 1))

# Residuals of model 1 and their Q-Q plot against the normal distribution.
model1_error <- Y - Y_hat_model1
head(model1_error)
##                y
## [1,]  13.9749910
## [2,]  -4.6721095
## [3,] -10.9765113
## [4,]  -3.8510601
## [5,]  -5.7048141
## [6,]   0.3178101
qqnorm(model1_error, col = "#47b86d",
       main = "QQ plot of Model 1 (y = θ1x4 + θ2x3² + θbias)")
qqline(model1_error, col = "#1130ee", lwd = 1)

# Residuals of model 2.
model2_error <- Y - Y_hat_model2
qqnorm(model2_error, col = "#47b86d",
       main = "QQ plot of Model 2 (y = θ1x4 + θ2x3² + θ3x5 + θbias)")
qqline(model2_error, col = "#1130ee")

# Residuals of model 3.
model3_error <- Y - Y_hat_model3
qqnorm(model3_error, col = "#47b86d",
       main = "QQ plot of Model 3 (y = θ1x3 + θ2x4 + θ3x5 + θbias)")
qqline(model3_error, col = "#1130ee")

# Residuals of model 4.
model4_error <- Y - Y_hat_model4
qqnorm(model4_error, col = "#47b86d",
       main = "QQ plot of Model 4 (y = θ1x4 + θ2x3² + θ3x5³ + θbias)")
qqline(model4_error, col = "#1130ee")

# Residuals of model 5.
# FIX: model 5 uses x3² (see Task 2.1), not x3³ as the original title said.
model5_error <- Y - Y_hat_model5
qqnorm(model5_error, col = "#47b86d",
       main = "QQ plot of Model 5 (y = θ1x4 + θ2x1² + θ3x3² + θbias)")
qqline(model5_error, col = "#1130ee")

# Task 2.7: splitting the data into training and testing datasets and computing estimates from the training dataset

#also plotting normal distribution graph of training data

 # Function to calculate Z-scores
calculate_z_scores <- function(data) {
  # Absolute z-scores: |x - mean(x)| / sd(x).
  # Tabular input (matrix/data.frame) is standardised column by column;
  # a plain vector is standardised as a whole.
  standardise <- function(v) abs((v - mean(v)) / sd(v))
  if (is.matrix(data) || is.data.frame(data)) {
    apply(data, 2, standardise)
  } else {
    standardise(data)
  }
}

# Calculate Z-scores for input variables X
z_scores_X <- calculate_z_scores(X)

# Identify rows where any X variable has a Z-score > 3.0
# (the conventional "3 sigma" outlier threshold).
outlier_rows_X <- which(apply(z_scores_X, 1, function(row) any(row > 3.0)))

# Print summary of outliers in predictors only
cat("Number of outliers in input variables:", length(outlier_rows_X), "\n")
## Number of outliers in input variables: 58
cat("Percentage of data with outliers:", round(100 * length(outlier_rows_X) / nrow(X), 2), "%\n")
## Percentage of data with outliers: 0.61 %
# Remove outliers from both X and Y (maintaining correspondence)
# NOTE(review): X_clean and Y_clean are never used again below — the
# train/test split operates on the original X and Y. Confirm whether the
# cleaned data was meant to feed the split.
X_clean <- X[-outlier_rows_X, ]
Y_clean <- Y[-outlier_rows_X, ]

# Verify dimensions match
cat("\nDimensions after outlier removal:\n")
## 
## Dimensions after outlier removal:
cat("X dimensions:", dim(X_clean), "\n")
## X dimensions: 9510 4
cat("Y dimensions:", length(Y_clean), "\n")
## Y dimensions: 9510
cat("Remaining data points:", nrow(X_clean), "\n")
## Remaining data points: 9510
# Optional: Print statistical summary of cleaned data
cat("\nSummary of cleaned input variables:\n")
## 
## Summary of cleaned input variables:
print(colMeans(X_clean))
##         x1         x3         x4         x5 
##   19.69280   54.36676 1013.19211   73.32460
cat("\nSummary of cleaned target variable:\n")
## 
## Summary of cleaned target variable:
cat("Mean:", mean(Y_clean), "\n")
## Mean: 454.2696
cat("SD:", sd(Y_clean), "\n")
## SD: 17.03703
# --- Train/test split and evaluation of the selected model (model 5) ---------
# FIX: the original split X and Y with two *independent* initial_split()
# calls, so the training inputs no longer corresponded to the training
# outputs (which is why the test R² came out ~0). Split ONCE on the combined
# data so the rows of X and Y stay aligned. Printed outputs will differ from
# the original knitted values because the split is random.
combined <- data.frame(X, y = Y[, "y"])
split_data <- initial_split(data = combined, prop = 0.7)
train_df <- training(split_data)
test_df <- testing(split_data)

input_cols <- c("x1", "x3", "x4", "x5")

# Keep the variable names the rest of the script expects.
Y_training_set <- train_df[, "y", drop = FALSE]
Y_training_data <- as.matrix(Y_training_set)
Y_testing_set <- as.matrix(test_df[, "y", drop = FALSE])

X_training_set <- train_df[, input_cols]
X_training_data <- as.matrix(X_training_set)
X_testing_set <- as.matrix(test_df[, input_cols])
X_testing_data <- X_testing_set

# Raw training predictors for model 5: x4, x1^2, x3^2.
X_train_model_raw <- cbind(
  x4 = X_training_set[, "x4"],
  x1_sq = X_training_set[, "x1"]^2,
  x3_sq = X_training_set[, "x3"]^2
)

# Scale the training predictors and keep the centring/scaling statistics so
# the test set is transformed with the *training* values (no leakage).
X_train_scaled <- scale(X_train_model_raw)
center_vals <- attr(X_train_scaled, "scaled:center")
scale_vals <- attr(X_train_scaled, "scaled:scale")

# Final training design matrix with the intercept column.
training_ones <- matrix(1, nrow(X_train_scaled), 1)
X_traning_model <- cbind(training_ones, X_train_scaled)

# Least-squares estimate from the training data.
# FIX: the original computed this twice (before and after the test-set
# setup); once is enough.
traning_thetahat <- solve(t(X_traning_model) %*% X_traning_model) %*%
  t(X_traning_model) %*% Y_training_data

# Apply the same transformation to the test inputs.
X_test_model_raw <- cbind(
  x4 = X_testing_set[, "x4"],
  x1_sq = X_testing_set[, "x1"]^2,
  x3_sq = X_testing_set[, "x3"]^2
)
X_test_scaled <- scale(X_test_model_raw, center = center_vals, scale = scale_vals)

test_ones <- matrix(1, nrow(X_test_scaled), 1)
X_test_model <- cbind(test_ones, X_test_scaled)

# Model output / prediction on the held-out test set.
Y_testing_hat <- X_test_model %*% traning_thetahat
head(Y_testing_hat)

# Residual sum of squares on the test set.
RSS_testing <- sum((Y_testing_set - Y_testing_hat)^2)
RSS_testing
# One-sample t-test of the pooled entries of the training design matrix.
# NOTE(review): this pools the intercept column and all scaled predictor
# columns into one sample, so the resulting CI describes the design-matrix
# entries, not the data or the model — confirm this is what the task wants.
t.test(X_traning_model, mu=500, alternative="two.sided", conf.level=0.95)
## 
##  One Sample t-test
## 
## data:  X_traning_model
## t = -84480, df = 26787, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 500
## 95 percent confidence interval:
##  0.2384051 0.2615949
## sample estimates:
## mean of x 
##      0.25
# NOTE(review): these bounds are hard-coded from a previous run; the split is
# random, so they will not match a re-run of the t-test above.
C_I1=-0.2783950
C_I2=0.2762101
# Density of all design-matrix entries with the (hard-coded) CI bounds drawn
# as vertical dashed lines. NOTE(review): plot() returns NULL, so p2 is NULL.
p2 <- plot(density(X_traning_model), col="#1130ee", lwd=2,
          main="Distribution of Traning Data")
abline(v=C_I1,col="#996668", lty=2)
abline(v=C_I2,col="#996668", lty=2)

# OLS fit on the RAW training inputs (x1, x3, x4, x5), unscaled and with no
# intercept column. NOTE(review): this is a different model from model 5
# fitted above — confirm whether this extra fit is intentional.
thetaHat_training =solve(t(X_training_data) %*% X_training_data) %*% t(X_training_data) %*%Y_training_data
thetaHat_training
##              y
## x1  0.37277590
## x3 -0.05581042
## x4  0.43921267
## x5  0.06901410
# Four coefficients: one per raw input column (no intercept).
length(thetaHat_training)
## [1] 4
# Kernel density of the training outputs.
dis_test=density(Y_training_data)
plot((dis_test), main = "Density plot of Output Training Data", col = "#1130ee")

# === Proper 95% Confidence Interval Calculation === #
# Builds per-observation 95% PREDICTION intervals for the test set from the
# training fit: variance = residual variance + parameter-uncertainty term.
# 1. Calculate residuals and estimate error variance from training data
Y_training_hat <- X_traning_model %*% traning_thetahat
# NOTE: this local `residuals` shadows stats::residuals() for the rest of
# the script; harmless here since the generic is never called.
residuals <- Y_training_data - Y_training_hat
RSS_train <- sum(residuals^2)
n_train <- nrow(X_traning_model)
p <- ncol(X_traning_model)  # Number of parameters (including intercept)
sigma_squared <- RSS_train / (n_train - p)  # Residual variance estimate

# 2. Calculate parameter covariance matrix
# cov(thetahat) = sigma^2 * (X'X)^(-1) for ordinary least squares.
param_cov_matrix <- sigma_squared * solve(t(X_traning_model) %*% X_traning_model)

# 3. Calculate prediction intervals for each test point
n_test <- nrow(X_test_model)
lower_CI <- numeric(n_test)
upper_CI <- numeric(n_test)
t_crit <- qt(0.975, df = n_train - p)  # Critical t-value for 95% CI

for (i in 1:n_test) {
  x_i <- X_test_model[i,]
  
  # Prediction variance has two components:
  # 1. Variance due to parameter uncertainty: x_i' * (X'X)^(-1) * x_i * sigma^2
  # 2. Variance due to random error: sigma^2
  pred_var_params <- t(x_i) %*% param_cov_matrix %*% x_i
  pred_variance <- sigma_squared + as.numeric(pred_var_params)
  
  # Calculate margin of error
  margin <- t_crit * sqrt(pred_variance)
  
  # Store CI bounds (centred on the point prediction for test point i)
  lower_CI[i] <- Y_testing_hat[i] - margin
  upper_CI[i] <- Y_testing_hat[i] + margin
}

# === Create Required Visualization === #
# Create data frame for plotting
plot_data <- data.frame(
  Index = 1:n_test,
  Actual = Y_testing_set,
  Predicted = Y_testing_hat,
  Lower_CI = lower_CI,
  Upper_CI = upper_CI
)

# Calculate performance metrics on the held-out test set.
MSE <- mean((Y_testing_set - Y_testing_hat)^2)
RMSE <- sqrt(MSE)
R_squared <- 1 - sum((Y_testing_set - Y_testing_hat)^2) / sum((Y_testing_set - mean(Y_testing_set))^2)

cat("Model 5 Performance Metrics:\n")
## Model 5 Performance Metrics:
cat("MSE:", MSE, "\n")
## MSE: 293.149
cat("RMSE:", RMSE, "\n")
## RMSE: 17.12159
# NOTE(review): R² ≈ 0 (negative) means the predictions are no better than
# the mean — check that the training X rows correspond to the training Y
# rows in the split above.
cat("R-squared:", R_squared, "\n")
## R-squared: -5.585531e-05
# Create plot using base R
par(mar = c(5, 5, 4, 2))  # Increase left margin for y-axis label

# Sort data points by actual values for clearer visualization
sorted_indices <- order(Y_testing_set)

# Plot actual vs. sample index
plot(plot_data$Index[sorted_indices], 
     Y_testing_set[sorted_indices],
     type = "p", 
     pch = 16,
     col = "#47b86d",
     xlab = "Test Sample Index", 
     ylab = "Energy Output (MW)",
     main = "Model 5: Predictions with 95% Confidence Intervals",
     ylim = range(c(lower_CI, upper_CI, Y_testing_set)))

# Add prediction line
lines(plot_data$Index[sorted_indices], 
      Y_testing_hat[sorted_indices], 
      col = "#1130ee", 
      lwd = 2)

# Add confidence interval as a shaded area
polygon(c(plot_data$Index[sorted_indices], rev(plot_data$Index[sorted_indices])),
        c(lower_CI[sorted_indices], rev(upper_CI[sorted_indices])),
        col = rgb(0.7, 0.7, 0.7, 0.3), 
        border = NA)

# Add error bars for select points (to avoid overcrowding)
# Every 10th point (in sorted order) gets a vertical CI segment.
for (i in seq(1, length(sorted_indices), 10)) {
  idx <- sorted_indices[i]
  segments(plot_data$Index[idx], lower_CI[idx], 
           plot_data$Index[idx], upper_CI[idx], 
           col = "gray40")
}

# Add legend
legend("topleft",
       legend = c("Actual Values", "Predicted Values", "95% Confidence Interval"),
       col = c("#47b86d", "#1130ee", "gray70"),
       pch = c(16, NA, 15),
       lty = c(NA, 1, NA),
       pt.cex = c(1, 0, 2),
       bg = "white")

# Add result text (metrics annotated above the plot)
mtext(paste("RMSE =", round(RMSE, 2), 
            "   R² =", round(R_squared, 3)), 
      side = 3, 
      line = 0.5, 
      cex = 0.8)

# --- Task 3: rejection ABC on model 5 ----------------------------------------
# The two largest-magnitude parameters (bias and theta1) get uniform priors
# centred on zero; the other two are held at their least-squares estimates.
Model5_thetahat
##               y
## [1,] 454.365009
## [2,]   1.349107
## [3,] -10.605331
## [4,]  -5.173124
# Values from thetahat:
thetebias <- 454.365009  # prior half-width for the bias parameter
thetaone <- 1.349107     # prior half-width for theta1
thetatwo <- -10.605331   # held constant
thetathree <- -5.173124  # held constant

Epison <- RSS_Model_5 * 2  # acceptance threshold (epsilon)
num <- 100                 # number of prior draws

# Accepted draws only; grown by acceptance counter, no NA gaps.
arr_1 <- numeric(0)
arr_2 <- numeric(0)
counter <- 0

for (i in 1:num) {
  range1 <- runif(1, -thetebias, thetebias)  # draw bias from its prior
  range2 <- runif(1, -thetaone, thetaone)    # draw theta1 from its prior
  New_thetahat <- matrix(c(range1, range2, thetatwo, thetathree))
  New_Y_Hat <- X_model5 %*% New_thetahat  # model output for this draw
  new_RSS <- sum((Y - New_Y_Hat)^2)
  # FIX: rejection ABC ACCEPTS a draw when its distance is below epsilon.
  # The original kept draws with new_RSS > Epison (i.e. the rejections) and
  # left NA gaps in arr_1/arr_2 at every other index i.
  if (new_RSS < Epison) {
    counter <- counter + 1
    arr_1[counter] <- range1
    arr_2[counter] <- range2
  }
}
f_value <- matrix(arr_1)
s_value <- matrix(arr_2)

# Guard: with a tight epsilon it is possible that no draw is accepted.
if (counter > 0) {
  # Marginal posterior histograms of the accepted draws.
  hist(f_value, col = "#47b86d")
  hist(s_value, col = "#47b86d")

  # Joint posterior scatter of the accepted draws.
  plot(f_value, s_value, col = c("#47b86d", "#1130ee"),
       main = "Joint and Marginal Posterior Distribution (Accepted Samples)")
} else {
  message("Rejection ABC accepted no samples; increase num or Epison.")
}

par(mfrow = c(1, 1))